* 1) Keep only sales of new properties to match with HCA data ----

use "${datadir}\landreg_2013-2020_prevnew", clear 

keep if new
ren price purchaseprice

// avoid duplicates or undistinguishable properties
bysort postcode purchaseprice dated (transaction_id): keep if _n == _N

save "${datadir}\landreg_2013-2020_newonly", replace



* 2) Match HCA and Land Registry new sales ----

use "${datadir}\HCA_full.dta", clear
ren dated dated_hca

// avoid duplicates or undistinguishable properties
bysort pcd purchaseprice dated (schemeid): keep if _n == _N

joinby pcd purchaseprice using ///
	"${datadir}\landreg_2013-2020_newonly" 
	
// unique schemeid
gen diff_date = dated - dated_hca
bysort schemeid: egen mindiff = min(abs(diff_date))
keep if abs(diff_date) == mindiff

// unique unit_id dated
bysort unit_id dated: egen mindiff2 = min(abs(diff_date))
keep if abs(diff_date) == mindiff2

sum diff_date, det
keep if inrange(diff_date, r(p1), r(p99))

// eliminate duplicates
bysort schemeid (transaction_id): keep if _n == _N
bysort transaction_id (schemeid): keep if _n == _N

save "${datadir}\landreg_2013-2020_newonly_matched", replace



* 3) Go back to Land Reg dataset and bring in matched ----

use "${datadir}\landreg_2013-2020_prevnew", clear 

merge 1:1 transaction_id using ///
	"${datadir}\landreg_2013-2020_newonly_matched"

save "${datadir}\landreg_2013-2020_matched_big", replace
